In this project, I will work with two powerful Python packages, Pandas and Seaborn. Both packages have extensive online documentation: there is a thorough tutorial on visualization with Pandas, the Seaborn tutorial contains many examples of data visualization, and the matplotlib website has additional resources for learning to plot with Python tools.
import matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import tweepy
import requests
import os
import json
import time
%matplotlib inline
from requests import get
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')
df_archive = pd.read_csv('./twitter-archive-enhanced.csv')
df_archive.shape
df_archive.describe()
df_archive.info()
df_archive.tweet_id.nunique()
df_archive.isnull().sum()
# Download the tweet image predictions TSV using the Requests library
url = "https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv"
response = requests.get(url)
response
# Write the downloaded content to image_predictions.tsv
with open('image_predictions.tsv', 'wb') as file:
    file.write(response.content)
# Read the tweet image predictions TSV file into a DataFrame
predict_img_df = pd.read_csv('image_predictions.tsv', sep='\t')
# print statistics
# high-level overview of data shape and composition
print('Rows and columns: ' + str(predict_img_df.shape))
print('')
print(predict_img_df.dtypes)
predict_img_df.info()
predict_img_df.head()
twitter_archive_master = pd.read_csv('twitter_archive_master.csv')
print('Rows and columns: ' + str(twitter_archive_master.shape))
twitter_archive_master.info()
twitter_archive_master.describe()
# !pip install tweepy
import tweepy
from tweepy import OAuthHandler
import json
from timeit import default_timer as timer
# Query Twitter API for each tweet in the Twitter archive and save JSON in a text file
# These are hidden to comply with Twitter's API terms and conditions
# app login
consumer_key = 'HIDDEN'
consumer_secret = 'HIDDEN'
# user login
access_token = 'HIDDEN'
access_secret = 'HIDDEN'
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)
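Since the real keys are hidden, a safer pattern is to keep them out of the notebook entirely. A minimal sketch, assuming the four environment variables have been exported in the shell beforehand (the variable names here are illustrative):
import os
# Read credentials from the environment instead of hard-coding them
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN')
access_secret = os.environ.get('TWITTER_ACCESS_SECRET')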
# NOTE TO STUDENT WITH MOBILE VERIFICATION ISSUES:
# df_1 is a DataFrame with the twitter_archive_enhanced.csv file. You may have to
# change line 17 to match the name of your DataFrame with twitter_archive_enhanced.csv
# NOTE TO REVIEWER: this student had mobile verification issues so the following
# Twitter API code was sent to this student from a Udacity instructor
# Iterate through all of the authenticated user's friends
# for friend in tweepy.Cursor(api.friends).items():
#     # Process the friend here
#     print(friend)
screen_name = "dog_rates"
statuses = api.user_timeline(screen_name)
print("retrieved: ", len(statuses))
status = statuses[0]
data_list = []
for status in statuses:
    data_list.append(status._json)
df = pd.DataFrame(data_list)
df
The Twitter API gives developers access to most of Twitter’s functionality. You can use it to read and write information about Twitter entities such as tweets, users, and trends. Here the API calls run without error.
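Below is a minimal sketch of how each tweet in the archive could be queried and its JSON saved line by line. It assumes api is the authenticated tweepy 3.x client created above; 'tweet_json.txt' is an illustrative file name.
# Sketch: fetch each archived tweet and append its JSON to a text file
with open('tweet_json.txt', 'w') as outfile:
    for tweet_id in df_archive.tweet_id:
        try:
            status = api.get_status(tweet_id, tweet_mode='extended')
            json.dump(status._json, outfile)
            outfile.write('\n')
        except tweepy.TweepError as e:
            print('Failed to fetch tweet {}: {}'.format(tweet_id, e))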
df.describe()
# List of dictionaries to read tweet's JSON data line by line and later convert to a DataFrame
df_list = []
with open('tweet-json copy', 'r') as json_file:
    for line in json_file:
        status = json.loads(line)
        # Append to list of dictionaries
        df_list.append({'tweet_id': status['id'],
                        'retweet_count': status['retweet_count'],
                        'favorite_count': status['favorite_count'],
                        'display_text_range': status['display_text_range']})
# Create a DataFrame with tweet ID, retweet count, favorite count and display_text_range
status_df = pd.DataFrame(df_list, columns = ['tweet_id', 'retweet_count', 'favorite_count', 'display_text_range'])
status_df
status_df.shape
status_df.info()
plt.figure(figsize=(20,10))
sns.heatmap(df_archive.isna(), cbar=False)
As the heatmap shows, a lot of data is missing in the reply and retweet status columns. Since we only want original tweets with images, we will delete those rows later; the missing values in the "expanded_urls" column will also disappear with that cleanup.
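A quick count backs up what the heatmap shows (a sketch using the column names seen above):
# Non-null counts for the reply/retweet status columns and expanded_urls
df_archive[['in_reply_to_status_id', 'in_reply_to_user_id',
            'retweeted_status_id', 'retweeted_status_user_id',
            'expanded_urls']].notnull().sum()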
for col in df_archive.select_dtypes('float64'):
    plt.figure()
    sns.distplot(df_archive[col])
for col in df_archive.select_dtypes('object'):
    print(f'{col :-<50} {df_archive[col].nunique()}')
df_archive['doggo'].value_counts()
df_archive['floofer'].value_counts()
df_archive['puppo'].value_counts()
df_archive['pupper'].value_counts()
The data is present for only 16% of the rows. Now let's take a look at the ratings. From what we have seen so far, the ratings always have the form 13/10, 12/10, and so on, so we would expect a numerator > 10 and a denominator = 10.
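To check that expectation directly, we can re-extract the ratings from the tweet text with a simple regex; the pattern below is an assumption about the rating format.
# Extract "numerator/denominator" pairs from the tweet text
extracted = df_archive['text'].str.extract(r'(\d+(?:\.\d+)?)/(\d+)', expand=True)
extracted.columns = ['num', 'den']
extracted['den'].value_counts().head()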
df_archive.nunique()
for col in df_archive.select_dtypes('object'):
    # Show the ten most frequent values per column
    print(f'{col :-<50} {df_archive[col].value_counts().head(10)}')
for col in df_archive.select_dtypes('int64'):
    plt.figure()
    sns.distplot(df_archive[col])
len(df_archive[df_archive.retweeted_status_id.isnull() == False])
# Checking if there are any records in df_archive whose corresponding record with the same tweet_id is missing in the predict_img_df table
len(df_archive[~df_archive.tweet_id.isin(predict_img_df.tweet_id)])
df_archive.name.value_counts().sort_index(ascending=False)
# Sort by rating_denominator values
df_archive.rating_denominator.value_counts().sort_index()
df_archive.name.value_counts()
print(df_archive.describe())
df_archive.info()
predict_img_df.info()
predict_img_df.columns
predict_img_df.info()
plt.figure(figsize=(8,6))
predict_img_df.dtypes.value_counts().plot.pie()
for col in predict_img_df.select_dtypes('float64'):
plt.figure()
sns.distplot(predict_img_df[col])
predict_img_df.describe()
The minimum and maximum of each prediction confidence are within the 0-1 range, as expected, and confidence decreases from the first to the third prediction.
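The mean confidence per prediction slot makes the decrease explicit:
# Average confidence should fall from p1 to p3
predict_img_df[['p1_conf', 'p2_conf', 'p3_conf']].mean()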
for col in predict_img_df.select_dtypes('object'):
    print(f'{col :-<50} {predict_img_df[col].nunique()}')
The best way to find duplicates is to look at the jpg_url column: value counts > 1 mean the data contains duplicates/retweets, and we only want tweets whose pictures contain dogs. Let's also see whether there are pictures for which the ML algorithm didn't predict any dog.
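A sketch of that duplicate check on the image URLs:
# Number of duplicated image URLs and the most repeated ones
print(predict_img_df['jpg_url'].duplicated().sum())
predict_img_df['jpg_url'].value_counts().head()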
for col in predict_img_df.select_dtypes('object'):
    print(f'{col :-<60} {predict_img_df[col].value_counts()}')
We saw that some entries have nothing to do with dogs: minibus, fountain, desktop_computer, ... But we can identify them with the "p1_dog" column.
predict_img_df.p1_dog.value_counts()
So, of the 2,750 pictures, 543 are identified as not dogs.
plt.figure(figsize=(20,10))
sns.heatmap(predict_img_df.isna(), cbar=False)
predict_img_df.head()
plt.figure(figsize=(20,10))
sns.heatmap(status_df.isna(), cbar=False)
status_df.head()
status_df.info()
for col in status_df.select_dtypes('object'):
    print(f'{col :-<50} {status_df[col].value_counts()}')
df_archive_clean = df_archive.copy()
predict_img_df_clean = predict_img_df.copy()
status_df_clean = status_df.copy()
Strip all html anchor tags (i.e. <a..>) in source column and retain just the text in between the tags. Convert the datatype from string to categorical.
df_archive_clean.source = df_archive_clean.source.str.replace(r'<(?:a\b[^>]*>|/a>)', '', regex=True)
df_archive_clean.source = df_archive_clean.source.astype('category')
df_archive_clean.source.value_counts()
df_archive_clean: In the name column, there are several values that are not dog names, like 'a', 'the', 'such', etc. We notice that all of these values are entirely lowercase, so we can use that to clean up this field.
mask = [not x.islower() if isinstance(x, str) and len(x) > 0 else False
        for x in df_archive_clean['name']]
df_archive_clean = df_archive_clean[mask]
# Checking for invalid names
unique_names = df_archive_clean['name'].unique()
print("<a> in names?", "a" in unique_names)
print("<the> in names?", "the" in unique_names)
unique_names
Change in_reply_to_status_id, in_reply_to_user_id, retweeted_status_id and retweeted_status_user_id to uint64 and use 0 as not-available
Change timestamp and retweeted_status_timestamp to datetimes
Convert data types to integer but ignore null values
def convert_to_uint64(df, column):
    # Start from all zeros (0 represents "not available")
    tmp = pd.Series(0, index=df.index, dtype="uint64")
    non_na_mask = ~df[column].isna()
    tmp[non_na_mask] = df[column][non_na_mask].astype("uint64")
    return tmp
for column in ["in_reply_to_status_id", "in_reply_to_user_id", "retweeted_status_id", "retweeted_status_user_id"]:
    df_archive_clean[column] = convert_to_uint64(df_archive_clean, column)
df_archive_clean.timestamp = pd.to_datetime(df_archive_clean.timestamp)
df_archive_clean.retweeted_status_timestamp = pd.to_datetime(df_archive_clean.retweeted_status_timestamp)
# Check data types are now correct
df_archive_clean.info()
Use tweet_id to fill in expanded_urls: if expanded_urls is null, construct the URL from the tweet id.
df_archive_clean['expanded_urls'].isna().sum()
def add_urls(row):
    # Don't do anything if expanded_urls is not null
    if pd.notnull(row['expanded_urls']):
        return row
    else:
        # Get tweet_id
        tweet_id = row['tweet_id']
        # Make new URL and save into expanded_urls
        row['expanded_urls'] = 'https://twitter.com/dog_rates/status/{}'.format(tweet_id)
        # Return updated row
        return row
# Save dataframe with missing urls
df_archive_clean = df_archive_clean.apply(add_urls, axis=1)
Check that there are no empty expanded_urls
print(df_archive_clean[df_archive_clean['expanded_urls'].isnull()])
print(df_archive_clean['expanded_urls'].isna().sum())
df_archive_clean['expanded_urls']
df_archive_clean: contains retweets and therefore duplicates
Keep only those rows in df_archive_clean table that are original tweets and NOT retweets (i.e. retweeted_status_id column is null). Delete the rest.
# Note: 0 represents missing ids
len(df_archive_clean[df_archive_clean.retweeted_status_id != 0])
df_archive_clean = df_archive_clean[df_archive_clean.retweeted_status_id==0]
len(df_archive_clean[df_archive_clean.retweeted_status_id != 0])
Remove the 343rd entry from the dataframe
# Drop 343rd entry (index=342 since starts at zero)
df_archive_clean = df_archive_clean.drop(df_archive_clean.index[342])
# Check entries around that point
df_archive_clean.iloc[340:343]
Find when text has dog stages in it and add it to the corresponding column
for col in ['pupper', 'doggo', 'puppo', 'floofer']:
    print(col)
    print(df_archive_clean[col].value_counts())
# Find when 'dog stage' is used in text column
def extend_dog_stages(row):
    # Only do something if <dog_stage> is in the text
    for dog_stage in ['pupper', 'doggo', 'puppo', 'floofer']:
        if dog_stage in row['text']:
            row[dog_stage] = dog_stage
    # Return row whether or not it was updated
    return row
df_archive_clean = df_archive_clean.apply(extend_dog_stages, axis=1)
for col in ['pupper', 'doggo', 'puppo', 'floofer']:
    print(col)
    print(df_archive_clean[col].value_counts())
Find entries that are retweets by matching the text pattern 'RT @...' and drop them from the table
# Find all the retweets
retweets = df_archive_clean.text.str.match('^RT @')
# Only keep the tweets that aren't retweets
df_archive_clean = df_archive_clean[~retweets]
# Test that we have no retweets
retweets = df_archive_clean.text.str.match('^RT @')
df_archive_clean[retweets]
Get the decimal ratings by matching the text pattern "#.#/#" and save as the new rating (only numerators had decimals)
print(list(df_archive_clean['rating_numerator'].unique()))
# Apply function to redo the rating if a decimal numerator is found
import re
def get_decimal_rating(row):
    # If the text contains a decimal numerator, update the rating
    decimal_numerator_pattern = r'(\d+\.\d+)/(\d+)'
    matches = re.search(decimal_numerator_pattern, row['text'])
    if matches is not None:
        row['rating_numerator'] = float(matches.group(1))
        row['rating_denominator'] = int(matches.group(2))
    return row
df_archive_clean = df_archive_clean.apply(get_decimal_rating, axis=1)
print(list(df_archive_clean['rating_numerator'].unique()))
# 13.5, 11.27, 11.26 are now present
Most ratings are expected to have a denominator of 10 and a numerator around 0...20.
We take a look at potential outliers and manually remove the incorrect tweets.
df_archive.rating_numerator.value_counts()
df_archive.rating_denominator.value_counts()
pd.set_option('display.max_colwidth', -1)
rating_ratio = df_archive_clean.rating_numerator / df_archive_clean.rating_denominator
mask = (rating_ratio < 0) | (rating_ratio > 2.0)
df_archive_clean[['rating_numerator', 'rating_denominator', 'text']][mask]
# Drop rows 313 and 516
df_archive_clean = df_archive_clean.drop([313, 516])
pd.set_option('display.max_colwidth', -1)
rating_ratio = df_archive_clean.rating_numerator / df_archive_clean.rating_denominator
mask = (rating_ratio < 0) | (rating_ratio > 2.0)
df_archive_clean[['rating_numerator', 'rating_denominator', 'text']][mask]
Keep only those records in the df_archive_clean table whose tweet_id exists in the predict_img_df table
df_archive_clean = df_archive_clean[df_archive_clean.tweet_id.isin(predict_img_df.tweet_id)]
len(df_archive_clean[~df_archive_clean.tweet_id.isin(predict_img_df.tweet_id)])
df_archive_clean.info()
Drop retweeted_status_id, retweeted_status_user_id and retweeted_status_timestamp columns from df_archive_clean table
df_archive_clean.drop(['retweeted_status_id', 'retweeted_status_user_id', 'retweeted_status_timestamp'], axis=1, inplace=True)
df_archive_clean.info()
One variable in four columns (doggo, floofer, pupper, and puppo)
The doggo, floofer, pupper and puppo columns in the df_archive_clean table should be merged into one column named "dog_stages"
len(df_archive_clean[(df_archive_clean.doggo != 'None') & (df_archive_clean.floofer != 'None')])
len(df_archive_clean[(df_archive_clean.doggo != 'None') & (df_archive_clean.pupper != 'None')])
len(df_archive_clean[(df_archive_clean.doggo != 'None') & (df_archive_clean.puppo != 'None')])
len(df_archive_clean[(df_archive_clean.floofer != 'None') & (df_archive_clean.pupper != 'None')])
len(df_archive_clean[(df_archive_clean.floofer != 'None') & (df_archive_clean.puppo != 'None')])
len(df_archive_clean[(df_archive_clean.pupper != 'None') & (df_archive_clean.puppo != 'None')])
# Dog stages into one column!
df_tmp = df_archive_clean.copy()
df_tmp['dog_stages'] = None
for col in ['doggo', 'floofer', 'pupper', 'puppo']:
    mask = df_tmp[col] == col
    df_tmp.loc[mask, 'dog_stages'] = col
# Handle the multi-stage combinations explicitly
mask_doggo_floofer = (df_archive_clean.doggo != 'None') & (df_archive_clean.floofer != 'None')
df_tmp.loc[mask_doggo_floofer, 'dog_stages'] = 'doggo|floofer'
mask_doggo_pupper = (df_archive_clean.doggo != 'None') & (df_archive_clean.pupper != 'None')
df_tmp.loc[mask_doggo_pupper, 'dog_stages'] = 'doggo|pupper'
mask_doggo_puppo = (df_archive_clean.doggo != 'None') & (df_archive_clean.puppo != 'None')
df_tmp.loc[mask_doggo_puppo, 'dog_stages'] = 'doggo|puppo'
df_tmp['dog_stages'].value_counts()
df_archive_clean = df_tmp
# drop old columns
df_archive_clean = df_archive_clean.drop(['doggo', 'pupper', 'puppo', 'floofer'], axis=1)
df_archive_clean['dog_stages'].value_counts()
Replace the value 'None' with NaN (missing value)
#replace the value 'None' with NaN (missing value)
df_archive_clean = df_archive_clean.replace('None', np.nan)
df_archive_clean.isnull().sum()
'None' previously passed the test as a name but isn't one. Drop tweets with missing names (previously 'None', now NaN).
mask = ~df_archive_clean['name'].isna()
df_archive_clean = df_archive_clean[mask]
df_archive_clean.isnull().sum()
p1, p2, p3 have inconsistent capitalization (sometimes the first letter is capitalized)
Making all strings in p1, p2, and p3 lowercase
predict_img_df_clean['p1'] = predict_img_df_clean['p1'].str.lower()
predict_img_df_clean['p2'] = predict_img_df_clean['p2'].str.lower()
predict_img_df_clean['p3'] = predict_img_df_clean['p3'].str.lower()
predict_img_df_clean.head()
Creating a new dataframe by merging the image predictions into a copy of the Twitter archive, keeping a prediction only if its tweet is already found in the archive
# Only keep predictions that have ids in archive
# (final size is as large as archive)
df1 = df_archive_clean
df2 = predict_img_df_clean
final_df_clean = pd.merge(df1, df2, how='left', on=['tweet_id'])
final_df_clean.info()
# If tweet id (from image predictions) not in archive,
# then it isn't in final merged dataframe
tweet_ids = predict_img_df_clean.tweet_id
for tweet_id in tweet_ids:
    # Test if id is also in archive
    if tweet_id not in df_archive_clean.tweet_id.values:
        # Check that it also isn't in new df
        if tweet_id in final_df_clean.tweet_id.values:
            print('Paradox! ID#{} in final DF but shouldn\'t be'.format(tweet_id))
status_df_clean.info()
Merge the tweet info into the final dataframe, keeping a tweet's info only if the tweet is already found in the final dataframe (the Twitter archive)
Note that 9 tweets are missing because they have been deleted.
# Only keep tweet info rows that have ids in the archive
# (final size is as large as the archive; status_df_clean already uses tweet_id)
df1 = final_df_clean
df2 = status_df_clean
final_df_clean = pd.merge(df1, df2, how='left', on=['tweet_id'])
# If tweet id (from tweet info) not in archive,
# then it isn't in final merged dataframe
tweet_ids = status_df_clean.tweet_id
for tweet_id in tweet_ids:
    # Test if id is in the archive
    if tweet_id not in df_archive_clean.tweet_id.values:
        # Check that it also isn't in new df
        if tweet_id in final_df_clean.tweet_id.values:
            print('ID#{} in final DF but shouldn\'t be'.format(tweet_id))
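One way to see those deleted tweets in the merged result is to count rows whose engagement counts came back empty (a sketch; the left merge leaves NaN where no tweet info was fetched):
# Archive tweets with no fetched info show up as NaN after the left merge
final_df_clean['retweet_count'].isna().sum()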
final_df_clean.info()
Storing our final, clean dataframe in the CSV file twitter_archive_master.csv
final_df_clean.to_csv('twitter_archive_master.csv', encoding='utf-8', index=False)
final_df_clean = pd.read_csv('twitter_archive_master.csv')
final_df_clean['source'].value_counts()
WeRateDogs has posted 98% of the tweets from an iPhone.
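The percentage can be checked directly from the normalized value counts:
# Share of tweets per source, in percent
(final_df_clean['source'].value_counts(normalize=True) * 100).round(1)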
final_df_clean['rating_numerator'].value_counts().sort_index()
final_df_clean['rating_numerator'][final_df_clean['rating_numerator'] > 10].value_counts().sum()
Out of a total of 1349 dogs rated by WeRateDogs, 824 were rated above 10/10, which is almost 61%. However, only one received the highest rating: 1776.
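The same numbers can be reproduced directly:
# Count and share of ratings above 10/10
above_10 = (final_df_clean['rating_numerator'] > 10).sum()
total = final_df_clean['rating_numerator'].count()
print('{} of {} rated above 10/10 ({:.0f}%)'.format(above_10, total, 100 * above_10 / total))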
Motivation: Which are the most common breeds? Are some dog breeds more common in tweets?
# Get attributes before determining dog breed
attributes = ['retweet_count', 'favorite_count', 'rating_denominator', 'rating_numerator']
df_dog_breeds = final_df_clean[attributes].copy()
# Create rating column
def percent_rating(row):
    if row['rating_denominator'] == 0:
        return 0
    return row['rating_numerator'] / row['rating_denominator']
df_dog_breeds['rating'] = df_dog_breeds.apply(percent_rating, axis=1)
# For each entry, check which is the most likely breed
def best_breed_match(row):
    # Defaults to compare against
    breed = 'not_dog'
    confidence = 0
    # Only keep a prediction if it is a possible dog
    dog_preds = [row[['p1', 'p1_dog', 'p1_conf']],
                 row[['p2', 'p2_dog', 'p2_conf']],
                 row[['p3', 'p3_dog', 'p3_conf']]]
    # Use these for easy reference
    index_breed, index_isDog, index_conf = 0, 1, 2
    for pred in dog_preds:
        # If it's a dog breed, check if it's the max confidence seen
        if pred.iloc[index_isDog]:
            # Save breed and confidence if confidence is higher
            if pred.iloc[index_conf] >= confidence:
                breed = pred.iloc[index_breed]
                confidence = pred.iloc[index_conf]
    # Update breed columns
    row['breed'] = breed
    row['breed_conf'] = confidence
    return row
# Build a frame with the prediction columns and apply the matcher
df_possible_breeds = final_df_clean[['p1', 'p1_dog', 'p1_conf',
                                     'p2', 'p2_dog', 'p2_conf',
                                     'p3', 'p3_dog', 'p3_conf']].copy()
df_possible_breeds = df_possible_breeds.apply(best_breed_match, axis=1)
# Save info into dog breed dataframe
df_dog_breeds['breed'] = df_possible_breeds['breed']
df_dog_breeds['breed_conf'] = df_possible_breeds['breed_conf']
# show 30 most common dog breeds
df_dog_breeds.breed.value_counts()[:30] # 'not_dog' is most common so keep this out of plot
# Plot the 12 most common dog breeds in a bar chart
plt.figure(figsize=(15,10))
only_dogs = df_dog_breeds['breed'] != 'not_dog'
df_dog_breeds[only_dogs].breed.value_counts()[:12][::-1].plot(kind='barh')
We first look at the top 30 most common dog breeds. Nearly 500 tweets were classified as not containing a dog, about 3 times more than the most common real breed. It could be that many tweets in this rating archive are not dogs, but that seems strange and unlikely. More plausibly, the image prediction model misclassified many images as not being dogs, especially considering that the data comes from a prediction model that may have made significant errors.
We then plotted the 12 most common (real) dog breeds in a bar chart. According to this data, the most common breed in the tweets was the golden retriever, with more than 150 tweets. The next four most common breeds, in descending order, were Labrador retriever, Pembroke (corgi), Chihuahua, and pug. The top three breeds fit well with my general use of the Internet and my experience viewing different images of dogs.
However, we must again be careful not to draw too many conclusions from this data, as we rely on the image prediction model to tell us the breed. Some breeds may have been harder for the model to identify and should in fact rank higher. In future work, the accuracy of the image predictions could be tested further to ensure the data reflects the true distribution of breeds.
plt.figure(figsize=(15,10))
final_df_clean['source'].value_counts().plot(kind='barh', figsize=(11,5), title='Most used Twitter source').set_xlabel("Number of Tweets")
plt.savefig('twitter_source')
plt.figure(figsize=(15,10))
final_df_clean.name.value_counts()[1:7].plot(kind='barh', figsize=(11,5), title='Top 6 common dog names').set_xlabel("Number of Dogs")
plt.savefig('dog_names')
final_df_clean.dog_stages.value_counts()
dog_counts = final_df_clean.dog_stages.value_counts()
fig,ax = plt.subplots(figsize = (15,10))
ax.bar(dog_counts.index, dog_counts.values, width = 0.8)
ax.set_ylabel('Dog Count')
ax.set_xlabel('Category')
plt.title("Most Common Dog Category")
plt.show()
# lmplot creates its own figure, so no separate plt.figure() call is needed
sns.lmplot(x='rating_numerator', y='rating_denominator',
           data=final_df_clean,
           hue='source',
           palette='Set2', fit_reg=False)
retweet_count = final_df_clean.retweet_count
print("The median and mean retweet count are: ", retweet_count.median(), "and ", retweet_count.mean(), "respectively." )
favorite_count = final_df_clean.favorite_count
print("The median and mean favorite count are: ", favorite_count.median(), "and ", favorite_count.mean(), "respectively." )
rating = df_dog_breeds.rating
print("The median and mean rating are: ", rating.median(), "and ", rating.mean(), "respectively." )